In [1]:
import os
# Remember the directory the kernel is running in and echo it, so the reader
# can see what the relative "../data/..." paths used below are relative to.
WORKING_DIRECTORY = os.getcwd()
print(
    "Current working directory: {}".format(WORKING_DIRECTORY)
)


Current working directory: /home/gabriel/Personal/jupyter/kaggle/dogsandcats/notebooks

In [3]:
# Dataset locations, relative to the notebook's working directory.
ORIGINAL_TRAIN_DIRECTORY = "../data/original_train/"  # source images; only read and copied from
TRAIN_DIRECTORY = "../data/train/"
VALID_DIRECTORY = "../data/valid/"
TEST_DIRECTORY = "../data/test/"

# Size handed to the batch generator as `target_size`. It equals the rounded
# mean (height, width) that `df.describe()` reports for the data set.
IMAGE_SIZE = (360, 404)
CLASSES = ["cat", "dog"]

# Fractions of each class's files that are copied to the validation and test
# folders; the remaining files go to the training folder.
VALIDATION_SIZE = 0.2
TEST_SIZE = 0.1

Samples


In [4]:
# NOTE(review): this cell was executed before `plot_grid`, `imgs` and `labels`
# existed in the kernel, hence the NameError recorded below. The same call is
# made further down, after `from utils import *` and after a batch has been
# drawn from `batch_generator` -- presumably this cell is a stale duplicate.
plot_grid(imgs, titles=labels)


------------------------------------------------------------------------
NameError                              Traceback (most recent call last)
<ipython-input-4-2eb8b136eb7d> in <module>()
----> 1 plot_grid(imgs, titles=labels)

NameError: name 'plot_grid' is not defined

In [2]:
# Switch off the notebook's periodic autosave (the output below confirms it).
%autosave 0


Autosave disabled

Data size


In [6]:
import pandas as pd
import glob
from PIL import Image

# Build one row per image of the original training set: path, pixel size, class.
files = glob.glob(ORIGINAL_TRAIN_DIRECTORY + '*')

records = []
for fpath in files:
    # `with` closes the file handle as soon as the size has been read, instead
    # of leaving one open image per file until garbage collection.
    with Image.open(fpath) as im:
        width, height = im.size  # PIL reports size as (width, height)
    records.append({'fpath': fpath, 'width': width, 'height': height})

# One constructor call replaces the per-row `.ix` writes; `.ix` no longer
# exists in current pandas, and row-by-row assignment is slow.
df = pd.DataFrame(records, columns=['fpath', 'width', 'height'])

# The class is the alphabetic prefix of the file *name* ("dog.798.jpg" -> "dog").
# Matching on the base name keeps the pattern independent of the directory, so
# it still works if ORIGINAL_TRAIN_DIRECTORY is changed, and the dot is escaped
# so that it only matches a literal ".".
df['category'] = (
    df['fpath']
    .map(os.path.basename)
    .str.extract(r'^([a-zA-Z]+)\.', expand=False)
)

In [38]:
# Peek at the first five rows to check that paths, sizes and classes look sane.
df.head(5)


Out[38]:
fpath height width category
0 ../data/original_train/dog.798.jpg 293 249 dog
1 ../data/original_train/dog.10748.jpg 301 349 dog
2 ../data/original_train/dog.8365.jpg 375 499 dog
3 ../data/original_train/cat.3227.jpg 443 291 cat
4 ../data/original_train/dog.9597.jpg 221 294 dog

In [39]:
# Summary statistics of the numeric columns (image height and width in pixels).
df.describe()


Out[39]:
height width
count 25000.000000 25000.00000
mean 360.478080 404.09904
std 97.019959 109.03793
min 32.000000 42.00000
25% 301.000000 323.00000
50% 374.000000 447.00000
75% 421.000000 499.00000
max 768.000000 1050.00000

The dataset contains 25,000 images. Their mean size is roughly 360 × 404 pixels (height × width; exactly 360.478080 × 404.09904), which matches the `IMAGE_SIZE` setting of `(360, 404)`.


In [40]:
# Number of images per class.
df.category.value_counts()


Out[40]:
cat    12500
dog    12500
Name: category, dtype: int64

In [7]:
%matplotlib inline
import seaborn as sns

# Bar chart of the class balance. The column is passed as `x=` because recent
# seaborn releases no longer accept it positionally (the first positional
# parameter is now `data`), while the keyword form works on every release.
ax = sns.countplot(x="category", data=df)
ax.set_title("Number of images per class");



In [43]:
# Scatter of image width against height, with a histogram of each on the margins.
# NOTE(review): `size` and `stat_func` are the spellings of older seaborn
# releases; newer ones use `height` and no longer accept `stat_func` --
# confirm against the installed version before re-running.
sns.jointplot(
    x='width',
    y='height',
    data=df,
    joint_kws=dict(s=0.5),
    marginal_kws={'bins': 50},
    size=10,
    stat_func=None,
);


Data preparation

Number of training examples:


In [9]:
import os
# Count every file found under the original training folder, sub-folders included.
TOTAL_NUMBER_FILES = sum(
    len(file_names)
    for _root, _directories, file_names in os.walk(ORIGINAL_TRAIN_DIRECTORY)
)
print("Total number of files in train folder:", TOTAL_NUMBER_FILES)


Total number of files in train folder: 25000

Folder structure

The original training directory consists of labelled images, each named according to the following convention:

data/original_train/CLASS.id.jpg

We are going to use `keras.preprocessing.image`, so we want one sub-folder per class, i.e. the folder structure should be:

data/train/CLASS/image-name.jpg


In [11]:
import glob
import os
import shutil
import numpy as np

# Start from a clean slate: drop the per-class folders left by a previous run
# and recreate them empty, so that re-running the split never mixes the files
# of two different shuffles.
for split_directory in (TEST_DIRECTORY, VALID_DIRECTORY, TRAIN_DIRECTORY):
    for class_name in CLASSES:
        class_directory = os.path.join(split_directory, class_name)
        shutil.rmtree(class_directory, ignore_errors=True)
        # `makedirs` also creates the split folder itself when it is missing
        # (plain `mkdir` would fail on a fresh checkout). It still raises if
        # the class folder exists, i.e. if the removal above silently failed.
        os.makedirs(class_directory)

In [12]:
# Seed of the shuffle below; a fixed value makes the train/valid/test split
# identical every time this cell is run.
SPLIT_SEED = 42


def split_class_files(class_name, rng):
    """Copy the images of one class into the validation, test and train folders.

    The files matching ``ORIGINAL_TRAIN_DIRECTORY + class_name + ".*"`` are
    shuffled with ``rng``. The first ``VALIDATION_SIZE`` fraction is copied to
    ``VALID_DIRECTORY/class_name``, the next ``TEST_SIZE`` fraction to
    ``TEST_DIRECTORY/class_name`` and the rest to ``TRAIN_DIRECTORY/class_name``.
    The original files are copied, not moved.

    Parameters
    ----------
    class_name : str
        File-name prefix of the class, e.g. ``"dog"``; also the name of the
        destination sub-folder, which must already exist.
    rng : numpy.random.RandomState
        Random generator used for the shuffle.

    Returns
    -------
    numpy.ndarray
        The shuffled file paths, in the order in which they were assigned.
    """
    pattern = ORIGINAL_TRAIN_DIRECTORY + class_name + ".*"
    # glob returns files in arbitrary order; sorting first makes the seeded
    # permutation reproducible across machines and runs.
    shuffled_files = rng.permutation(sorted(glob.glob(pattern)))

    number_validation_files = int(len(shuffled_files) * VALIDATION_SIZE)
    number_test_files = int(len(shuffled_files) * TEST_SIZE)

    for index, source_path in enumerate(shuffled_files):
        if index < number_validation_files:
            split_directory = VALID_DIRECTORY
        elif index < number_validation_files + number_test_files:
            split_directory = TEST_DIRECTORY
        else:
            split_directory = TRAIN_DIRECTORY
        file_name = os.path.basename(source_path)
        shutil.copy(source_path, os.path.join(split_directory, class_name, file_name))

    return shuffled_files


split_rng = np.random.RandomState(SPLIT_SEED)
dog_files = split_class_files("dog", split_rng)
cat_files = split_class_files("cat", split_rng)

In [ ]:
## Samples

In [21]:
import utils;
from utils import *

# Draw one batch of two images from the validation folder so they can be
# inspected visually (the helper comes from the project's `utils` module).
batch_generator = get_keras_batch_generator(
    VALID_DIRECTORY,
    batch_size=2,
    target_size=tuple(IMAGE_SIZE[:2]),
)
imgs, labels = next(batch_generator)


Found 5000 images belonging to 2 classes.

In [22]:
# Render figures inside the notebook, then show the batch drawn above with its
# labels as titles.
%matplotlib inline
plot_grid(imgs, titles=labels)



In [23]:
imgs,labels = next(batch_generator)
%matplotlib inline
plot_grid(imgs, titles=labels)



In [ ]: